Show the code
import pandas as pd
import polars as pl
import numpy as np
from lets_plot import *
# add the additional libraries you need to import for ML here
LetsPlot.setup_html(isolated_frame=True)import pandas as pd
import polars as pl
import numpy as np
from lets_plot import *
# add the additional libraries you need to import for ML here
LetsPlot.setup_html(isolated_frame=True)# import your data here using pandas and the URL
url = "https://github.com/fivethirtyeight/data/raw/master/star-wars-survey/StarWars.csv"
df = pl.read_csv("StarWars.csv")Shorten the column names and clean them up for easier use with pandas. Provide a table or list that exemplifies how you fixed the names.
Many of the names were manually renamed, other names were taken from the column details and adapted to lower case and had the spaces replaced with underscores.
# Include and execute your code here
# Fix UTF encoding
# Create a readme/data.md using the og column names
df_clean = df.rename({
df.columns[1]: "seen",
df.columns[2]: "fan",
df.columns[3]: "seen_epi_i",
df.columns[4]: "seen_epi_ii",
df.columns[5]: "seen_epi_iii",
df.columns[6]: "seen_epi_iv",
df.columns[7]: "seen_epi_v",
df.columns[8]: "seen_epi_vi",
df.columns[9]: "rank_epi_i",
df.columns[10]: "rank_epi_ii",
df.columns[11]: "rank_epi_iii",
df.columns[12]: "rank_epi_iv",
df.columns[13]: "rank_epi_v",
df.columns[14]: "rank_epi_vi",
df.columns[15]: df[df.columns[15]][0].lower().replace(' ', '_'),
df.columns[16]: df[df.columns[16]][0].lower().replace(' ', '_'),
df.columns[17]: df[df.columns[17]][0].lower().replace(' ', '_'),
df.columns[18]: df[df.columns[18]][0].lower().replace(' ', '_'),
df.columns[19]: df[df.columns[19]][0].lower().replace(' ', '_'),
df.columns[20]: df[df.columns[20]][0].lower().replace(' ', '_'),
df.columns[21]: df[df.columns[21]][0].lower().replace(' ', '_'),
df.columns[22]: df[df.columns[22]][0].lower().replace(' ', '_'),
df.columns[23]: df[df.columns[23]][0].lower().replace(' ', '_'),
df.columns[24]: df[df.columns[24]][0].lower().replace(' ', '_'),
df.columns[25]: df[df.columns[25]][0].lower().replace(' ', '_'),
df.columns[26]: df[df.columns[26]][0].lower().replace(' ', '_'),
df.columns[27]: df[df.columns[27]][0].lower().replace(' ', '_'),
df.columns[28]: df[df.columns[28]][0].lower().replace(' ', '_'),
df.columns[29]: "shot_first",
df.columns[30]: "ex_uni",
df.columns[31]: "fan_ex_uni",
df.columns[32]: "fan_star_trek",
df.columns[33]: df.columns[33].lower().replace(' ', '_'),
df.columns[34]: df.columns[34].lower().replace(' ', '_'),
df.columns[35]: df.columns[35].lower().replace(' ', '_'),
df.columns[36]: df.columns[36].lower().replace(' ', '_'),
df.columns[37]: "location"
})
display(df.head())
df_clean = df_clean[1:]| RespondentID | Have you seen any of the 6 films in the Star Wars franchise? | Do you consider yourself to be a fan of the Star Wars film franchise? | Which of the following Star Wars films have you seen? Please select all that apply. | _duplicated_0 | _duplicated_1 | _duplicated_2 | _duplicated_3 | Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film. | _duplicated_4 | _duplicated_5 | _duplicated_6 | _duplicated_7 | _duplicated_8 | Please state whether you view the following characters favorably, unfavorably, or are unfamiliar with him/her. | _duplicated_9 | _duplicated_10 | _duplicated_11 | _duplicated_12 | _duplicated_13 | _duplicated_14 | _duplicated_15 | _duplicated_16 | _duplicated_17 | _duplicated_18 | _duplicated_19 | _duplicated_20 | _duplicated_21 | Which character shot first? | Are you familiar with the Expanded Universe? | Do you consider yourself to be a fan of the Expanded Universe?�� | Do you consider yourself to be a fan of the Star Trek franchise? | Gender | Age | Household Income | Education | Location (Census Region) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str |
| null | "Response" | "Response" | "Star Wars: Episode I The Phan… | "Star Wars: Episode II Attack … | "Star Wars: Episode III Reveng… | "Star Wars: Episode IV A New H… | "Star Wars: Episode V The Empir… | "Star Wars: Episode VI Return o… | "Star Wars: Episode I The Phan… | "Star Wars: Episode II Attack … | "Star Wars: Episode III Reveng… | "Star Wars: Episode IV A New H… | "Star Wars: Episode V The Empir… | "Star Wars: Episode VI Return o… | "Han Solo" | "Luke Skywalker" | "Princess Leia Organa" | "Anakin Skywalker" | "Obi Wan Kenobi" | "Emperor Palpatine" | "Darth Vader" | "Lando Calrissian" | "Boba Fett" | "C-3P0" | "R2 D2" | "Jar Jar Binks" | "Padme Amidala" | "Yoda" | "Response" | "Response" | "Response" | "Response" | "Response" | "Response" | "Response" | "Response" | "Response" |
| 3292879998 | "Yes" | "Yes" | "Star Wars: Episode I The Phan… | "Star Wars: Episode II Attack … | "Star Wars: Episode III Reveng… | "Star Wars: Episode IV A New H… | "Star Wars: Episode V The Empir… | "Star Wars: Episode VI Return o… | "3" | "2" | "1" | "4" | "5" | "6" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "I don't understand this questi… | "Yes" | "No" | "No" | "Male" | "18-29" | null | "High school degree" | "South Atlantic" |
| 3292879538 | "No" | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | null | "Yes" | "Male" | "18-29" | "$0 - $24,999" | "Bachelor degree" | "West South Central" |
| 3292765271 | "Yes" | "No" | "Star Wars: Episode I The Phan… | "Star Wars: Episode II Attack … | "Star Wars: Episode III Reveng… | null | null | null | "1" | "2" | "3" | "4" | "5" | "6" | "Somewhat favorably" | "Somewhat favorably" | "Somewhat favorably" | "Somewhat favorably" | "Somewhat favorably" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "Unfamiliar (N/A)" | "I don't understand this questi… | "No" | null | "No" | "Male" | "18-29" | "$0 - $24,999" | "High school degree" | "West North Central" |
| 3292763116 | "Yes" | "Yes" | "Star Wars: Episode I The Phan… | "Star Wars: Episode II Attack … | "Star Wars: Episode III Reveng… | "Star Wars: Episode IV A New H… | "Star Wars: Episode V The Empir… | "Star Wars: Episode VI Return o… | "5" | "6" | "1" | "2" | "4" | "3" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Somewhat favorably" | "Very favorably" | "Somewhat favorably" | "Somewhat unfavorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "Very favorably" | "I don't understand this questi… | "No" | null | "Yes" | "Male" | "18-29" | "$100,000 - $149,999" | "Some college or Associate degr… | "West North Central" |
Filter the dataset to 835 respondents that have seen at least one film (Hint: Don’t use the column Have you seen any of the 6 films in the Star Wars franchise?) Not much to show here by way of output. Print the shape (i.e. number of rows and number of columns) of the filtered dataset.
Gathering the shape of the seen movies, the count is 835.
# Include and execute your code here
seen_cols = ["seen_epi_i", "seen_epi_ii", "seen_epi_iii", "seen_epi_iv", "seen_epi_v", "seen_epi_vi",]
df_seen = df_clean.filter(
(pl.col(df_clean.columns[3]).is_not_null())
| (pl.col(df_clean.columns[4]).is_not_null())
| (pl.col(df_clean.columns[5]).is_not_null())
| (pl.col(df_clean.columns[6]).is_not_null())
| (pl.col(df_clean.columns[7]).is_not_null())
| (pl.col(df_clean.columns[8]).is_not_null())
).select(seen_cols)
movies = ["The Phantom Menace", "Attack of the Clones", "Revenge of the Sith", "A New Hope", "The Empire Strikes Back", "Return of the Jedi"]
seen_counts = [df_seen[col].count() for col in seen_cols]
seen_percs = [i / df_seen.shape[0] for i in seen_counts]
df_percs = pl.DataFrame({"movie": movies[::-1], "percentage": seen_percs[::-1]})
df_percs = df_percs.with_columns(
((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
)
rank_cols = ["rank_epi_i", "rank_epi_ii", "rank_epi_iii", "rank_epi_iv", "rank_epi_v", "rank_epi_vi",]
df_rank = df_clean.filter(
(pl.col(df_clean.columns[3]).is_not_null())
& (pl.col(df_clean.columns[4]).is_not_null())
& (pl.col(df_clean.columns[5]).is_not_null())
& (pl.col(df_clean.columns[6]).is_not_null())
& (pl.col(df_clean.columns[7]).is_not_null())
& (pl.col(df_clean.columns[8]).is_not_null())
).select(rank_cols)
# Episode 3 has 1 missing rank (when compared to the other movies the rank option left is 6)
df_rank = df_rank.with_columns(pl.col('rank_epi_iii').fill_null("6"))
rank_counts = [df_rank.filter(pl.col(i) == "1").height for i in rank_cols]
# .height = .shape[0] in polars
rank_percs = [i / df_rank.height for i in rank_counts]
df_rank_percs = pl.DataFrame({"movie": movies[::-1], "percentage": rank_percs[::-1]})
df_rank_percs = df_rank_percs.with_columns(
((pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + '%').alias('perc_label')
)
print(f"The number of respondents that have seen at least one movie: {df_seen.height}")The number of respondents that have seen at least one movie: 835
Validate that the data provided on GitHub lines up with the article by recreating 2 of the visuals from the article. These visuals should be similar, but don’t have to be exact. They need to be close enough that we can validate that the values in the dataset match the graphs in the chart. Though their charts were built using a different plotting software, the more you push yourself for an exact replica, the more you will learn. Spend at least a couple of hours on this.
The Empire Strikes Back is the most seen film of the saga. It also happens to be the favorite.
# Include and execute your code here
movie_graph = (
ggplot(data=df_percs)
+ geom_bar(mapping=aes(x='percentage', y='movie'), stat='identity', orientation='y', color="lightblue", fill="lightblue")
+ labs(
title="Which 'Star Wars' Movie's Have You Seen?",
subtitle="Of 835 respondents who have seen any film",
x='',
y=''
)
+ scale_x_continuous(limits=[0,1])
+ geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.075, size=12, color='black')
+ theme(
panel_background=element_rect(fill='gray', linetype=0),
plot_background=element_rect(fill='gray'),
# panel_grid_major=element_rect(fill='gray'),
panel_grid=element_blank(),
legend_background=element_rect(fill='gray'),
axis_text=element_text(color='black', size=18),
# axis_title=element_text(color='white'),
plot_title=element_text(color='black', face="bold", hjust=0, size=25),
plot_subtitle=element_text(color='black', hjust=0, size=20),
legend_text=element_text(color='white'),
legend_title=element_text(color='white'),
label_text=element_text(color='white'),
axis_line_x=element_blank(),
axis_ticks_x=element_blank(),
axis_text_x=element_blank(),
plot_title_position='plot'
)
+ ggsize(1600, 900)
)
display(movie_graph)rank_graph = (
ggplot(data=df_rank_percs)
+ geom_bar(mapping=aes(x='percentage', y='movie'), stat='identity', orientation='y', color="lightblue", fill="lightblue")
+ labs(
title="What's the Best 'Star Wars' Movie?",
subtitle="Of 741 respondents who have seen all 6 films",
x='',
y=''
)
+ scale_x_continuous(limits=[0,0.5])
+ geom_text(aes(x='percentage', y='movie', label='perc_label'), nudge_x=0.075, size=12, color='black')
+ theme(
panel_background=element_rect(fill='gray', linetype=0),
plot_background=element_rect(fill='gray'),
panel_grid=element_blank(),
legend_background=element_rect(fill='gray'),
axis_text=element_text(color='black', size=18),
plot_title=element_text(color='black', face="bold", hjust=0, size=25),
plot_subtitle=element_text(color='black', hjust=0, size=20),
legend_text=element_text(color='white'),
legend_title=element_text(color='white'),
label_text=element_text(color='white'),
axis_line_x=element_blank(),
axis_ticks_x=element_blank(),
axis_text_x=element_blank(),
plot_title_position='plot'
)
+ ggsize(1600, 900)
)
display(rank_graph)